library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(gganimate)
library(gifski)
library(png)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
library(geosphere)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
#Weather data already incorporated into SampleCitiBike.csv
rawdata <- read.csv("SampleCitiBike.csv")
sampleData <- sample_frac(rawdata, 0.01)
write.csv(sampleData, "new_MASTER_01_data.csv")
masterdata <- read.csv("new_MASTER_01_data.csv")
masterdata$X <- NULL
masterdata$starttime <- NULL
masterdata$stoptime <- NULL
masterdata$start.station.id <- as.factor(masterdata$start.station.id)
masterdata$start.station.name <- as.factor(masterdata$start.station.name)
masterdata$end.station.id <- as.factor(masterdata$end.station.id)
masterdata$end.station.name <- as.factor(masterdata$end.station.name)
masterdata$bikeid <- as.factor(masterdata$bikeid)
masterdata$usertype <- as.factor(masterdata$usertype)
masterdata <- rename(masterdata, startTime = newStartTime, stopTime = newStopTime)
masterdata$startTime <- as.POSIXct(strptime(masterdata$startTime, "%Y-%m-%d %H:%M:%S"))
masterdata$stopTime <- as.POSIXct(strptime(masterdata$stopTime, "%Y-%m-%d %H:%M:%S"))
masterdata$startDate <- as.Date(masterdata$startTime)
masterdata$stopDate <- as.Date(masterdata$stopTime)
masterdata$distMeters <- distHaversine(cbind(masterdata$start.station.latitude, masterdata$start.station.longitude), cbind(masterdata$end.station.latitude, masterdata$end.station.longitude))
masterdata$ageGroup <- as.factor(ifelse(masterdata$birth.year >= 2000, "GenZ", ifelse(masterdata$birth.year >= 1981, "Millennial", ifelse(masterdata$birth.year >= 1965, "GenX", ifelse(masterdata$birth.year >= 1946, "Boomer", ifelse(masterdata$birth.year >= 1928, "Silent", "VeryOld"))))))
masterdata$ageGroup <- factor(masterdata$ageGroup, levels = c("GenZ", "Millennial", "GenX", "Boomer", "Silent", "VeryOld"))
masterdata$startMonth <- month(masterdata$startDate)
masterdata$stopMonth <- month(masterdata$stopDate)
masterdata$startMonthFactor <- as.factor(month(masterdata$startDate))
masterdata$stopMonthFactor <- as.factor(month(masterdata$stopDate))
masterdata$seasonStart <- as.factor(ifelse(masterdata$startMonth >= 3 & masterdata$startMonth <= 5, "Spring", ifelse(masterdata$startMonth >= 6 & masterdata$startMonth <= 8, "Summer", ifelse(masterdata$startMonth >= 9 & masterdata$startMonth <= 11, "Fall", "Winter"))))
masterdata$seasonStart <- factor(masterdata$startMonth, levels = c("Spring", "Summer", "Fall", "Winter"))
masterdata$numWeekday <- as.factor(wday(masterdata$startDate))
#Defining rush hour as 6-10AM and 4-8PM
masterdata$rushHour <- as.factor(ifelse(masterdata$numWeekday == 1 | masterdata$numWeekday == 7, "No", ifelse(hour(masterdata$startTime) < 6 | hour(masterdata$startTime) > 10 & hour(masterdata$startTime) < 16 | hour(masterdata$startTime) > 20, "No", "Yes")))
masterdata <- rename(masterdata, maxTemp = TMAX, minTemp = TMIN)
masterdata$weekNum <- as.numeric(strftime(masterdata$startDate, format = "%V"))
masterdata$speedMetersperSec <- masterdata$distMeters / masterdata$tripduration
masterdata <- rename(masterdata, avgTemp = TAVG)
masterdata$tempFeel <- as.factor(ifelse(masterdata$maxTemp < 40, "Frigid", ifelse(masterdata$maxTemp < 58, "Cold", ifelse(masterdata$maxTemp < 65, "Cool", ifelse(masterdata$maxTemp < 75, "Warm", ifelse(masterdata$maxTemp < 95, "Hot", "Blazing"))))))
masterdata$tempFeel <- factor(masterdata$tempFeel, levels = c("Frigid", "Cold", "Cool", "Warm", "Hot", "Blazing"))
masterdata$gender <- as.factor(ifelse(masterdata$gender == "0", "Unknown", ifelse(masterdata$gender == "1", "Male", "Female")))
masterdata$timeOfDay <- ifelse(hour(masterdata$startTime) >= 0 & hour(masterdata$startTime) < 12, "morning", ifelse(hour(masterdata$startTime) >= 12 & hour(masterdata$startTime) <=24, "afternoon","night"))
masterdata$roundedSNOW <- floor(masterdata$SNOW)
masterAM <- filter(masterdata, masterdata$timeOfDay == "morning")
masterPM <- filter(masterdata,masterdata$timeOfDay == "afternoon")
weekdayData <- subset(masterAM, subset = (masterAM$numWeekday != "7" & masterAM$numWeekday != "1"))
weekendData <- subset(masterAM, subset = (masterAM$numWeekday == "7" | masterAM$numWeekday == "1"))
#Ride Distance by Maximum Temperature by Gender
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = gender)) + geom_point() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "Gender")
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = gender)) + geom_violin() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "Gender")
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = gender)) + geom_boxplot() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "Gender")
#Ride Distance by Maximum Temperature by User Type
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = usertype)) + geom_point() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "User Type")
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = usertype)) + geom_violin() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "User Type")
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = usertype)) + geom_boxplot() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "User Type")
#Ride Distance by Minimum Temperature by Gender
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = gender)) + geom_point() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "Gender")
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = gender)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = gender)) + geom_violin() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "Gender")
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = gender)) + geom_boxplot() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "Gender")
#Ride Distance by Minimum Temperature by User Type
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = usertype)) + geom_point() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "User Type")
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = usertype)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = usertype)) + geom_violin() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "User Type")
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = usertype)) + geom_boxplot() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "User Type")
#By Maximum Temperature By Gender
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = gender)) + geom_point() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = gender)) + geom_violin() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = gender)) + geom_boxplot() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
#By Maximum Temperature By User Type
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = usertype)) + geom_point() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = usertype)) + geom_violin() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = usertype)) + geom_boxplot() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
#By Minimum Temperature By Gender
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = gender)) + geom_point() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = gender)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = gender)) + geom_violin() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = gender)) + geom_boxplot() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
#By Minimum Temperature By User Type
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = usertype)) + geom_point() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = usertype)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = usertype)) + geom_violin() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = usertype)) + geom_boxplot() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
#By Day of Week
masterdata %>%
group_by(startDate, numWeekday) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = numWeekday, y = numRides)) + geom_boxplot() + labs(x = "Weekday (1 = Sunday, 7 = Saturday)", y = "Average Number of CitiBike Rides")
## `summarise()` regrouping output by 'startDate' (override with `.groups` argument)
#By Maximum Temperature
masterdata %>%
group_by(maxTemp) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = maxTemp, y = numRides)) + geom_point() + labs(x = "Maximum Temperature (F)", y = "Average Number of CitiBike Rides")
## `summarise()` ungrouping output (override with `.groups` argument)
#By Minimum Temperature
masterdata %>%
group_by(minTemp) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = minTemp, y = numRides)) + geom_point() + labs(x = "Minimum Temperature (F)", y = "Average Number of CitiBike Rides")
## `summarise()` ungrouping output (override with `.groups` argument)
#By Day of Week by Temperature (Not super meaningful - keeping this here as a template)
masterdata %>%
group_by(weekNum, numWeekday) %>%
summarise(
numRides = mean(n()),
temp = maxTemp
) %>%
ggplot(aes(x = temp, y = numRides)) + geom_boxplot() + transition_time(weekNum) + labs(title = "Week Number: {frame_time}")
## `summarise()` regrouping output by 'weekNum', 'numWeekday' (override with `.groups` argument)
#Speed by Maximum Temperature by Gender
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = gender)) + geom_point() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "Gender")
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = gender)) + geom_violin() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "Gender")
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = gender)) + geom_boxplot() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "Gender")
#Speed by Maximum Temperature by User Type
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = usertype)) + geom_point() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "User Type")
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = usertype)) + geom_violin() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "User Type")
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = usertype)) + geom_boxplot() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "User Type")
#Speed by Minimum Temperature by Gender
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = gender)) + geom_point() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "Gender")
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = gender)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = gender)) + geom_violin() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "Gender")
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = gender)) + geom_boxplot() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "Gender")
#Speed by Minimum Temperature by User Type
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = usertype)) + geom_point() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "User Type")
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = usertype)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = usertype)) + geom_violin() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "User Type")
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = usertype)) + geom_boxplot() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "User Type")
masterdata %>%
group_by(tempFeel, start.station.id) %>%
summarise(
numRides = n()
) %>%
arrange(desc(numRides)) %>%
slice(1:5) %>%
ggplot(aes(x = tempFeel, y = numRides, colour = start.station.id )) + geom_boxplot()
## `summarise()` regrouping output by 'tempFeel' (override with `.groups` argument)
masterdata %>%
group_by(tempFeel, end.station.id) %>%
summarise(
numRides = n()
) %>%
slice(1:5) %>%
ggplot(aes(x = reorder(end.station.id, numRides, na.rm = TRUE), y = numRides)) + geom_boxplot() + transition_states(tempFeel, transition_length = 2, state_length = 1) + enter_fade() + exit_shrink() + ease_aes('sine-in-out') + labs(title = "Weather Feel: {closest_state}")
## `summarise()` regrouping output by 'tempFeel' (override with `.groups` argument)
nrow(masterdata[masterdata$PRCP < .5,])
## [1] 187806
nrow(masterdata[masterdata$PRCP >= .5 & masterdata$PRCP < 1,])
## [1] 13168
nrow(masterdata[masterdata$PRCP >= 1 & masterdata$PRCP < 1.5,])
## [1] 2712
nrow(masterdata[masterdata$PRCP >= 1.5,])
## [1] 1831
These numbers will guide the analysis below, as it is important to note that, while the averages on the y-axis may provide suggest certain insights, looking at the confidence intervals at various ranges will be useful in drawing meaningful insights. As these metrics indicate, PRCP certainly has a negative correlation with number of rides that occur, which suggests that bikers in higher PRCP may not be reflective of the typical Citibike biker.
#prcp vs Haversine Distance (distMeters) by gender
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_boxplot()
#prcp vs tripduration by usertype
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_boxplot()
Amongst unknown genders, PRCP is associated with a decrease in distance. For males and females, there seems to be a decrease in distance as PRCP increases to a certain level, after which the rate of decrease diminishes. For females, the distance begins to increase, whereas for males it mostly plateaus. This, as seen previously, may be reflective of who is biking in these various PRCP ranges. In the middle range, we can infer that people try to minimize distance if they can feasibly. Perhaps as PRCP becomes drastic, only those with a need to bike will be out, who may be not be able to adjust the distance of their trip. The disparity between male response and female response here is curious. Customers, who are likely recreational/infrequent users, predictably decrease distance in correlation to increased PRCP. Subscribers reflect a response similar to the females mentioned previously.
ggplot(data=masterdata, aes(x=PRCP,y=distMeters)) + geom_point() + facet_wrap(~ startMonthFactor)
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=usertype)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=gender)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=gender)) + geom_smooth() + facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=usertype)) + geom_smooth() + facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
In colder months and August, it seems that PRCP does not have significant correlation to average distance. In other months, the correlation fluctuates or steadily yields lower distance as PRCP increases. Interestingly, April which is a very rainy month traditionally seems to have the greatest fluctuation for distance’s correlation with PRCP. Customers seem reliably unaffected by PRCP values in aggregate, except for a few interesting examples in August and May. Subscribers, again, vary greatly in their response, which may suggest that we must look into the behavioral trends of specific users to gain a full picture. While most insights from this data is fundamentally speculative, it is interesting to note the disparity in how females, males, and unknown genders vary in their response to PRCP, when separated into usertypes. Female customers seem unbothered, while female subscribers decrease distances up until a certain point and then increase again (potentially due to only necessary rides being made, which are not responsive to PRCP changes). Male customers strongly decrease distance as PRCP increases, while male subscribers reflect a similar pattern as female subscribers (potentially due to the aforementioned insight). Similar insights are yielded by separating user types into genders.
ggplot(data=masterdata, aes(x=startDate, y=tripduration, colour=gender)) + geom_point()
#newStartDate vs tripduration by gender
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_violin()
## Warning: position_dodge requires non-overlapping x intervals
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_boxplot()
#newStartDate vs tripduration by usertype
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_boxplot()
As one might expect, trip duration increases during warmer months and decreases as temperature drops; this suggests that traveling longer distances is either more necessary or enjoyable in warmer months. Females on average have longer trips than men. Unknown gender has the highest trip duration, and customers have higher trip durations than subscribers. Perhaps customers do not have to reveal their gender information, and perhaps these customers differ in ways other than just status as it pertains to their trip duration. Citibike managers should keep in mind that any sort of system overhauls, construction, or repair should be placed in a month with less demand so the company does not miss out on revenue from peak times.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_boxplot()
#prcp vs tripduration by usertype
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_boxplot()
As precipitation increases, trip duration decreases. Females again have higher average trip duration, but they seem to have a varied correlation to preciptation. Perhaps the rise/plateau at the high PRCP levels for both males and females is influenced by people who use Citibike out of necessity. This means that the primary decrease in trip duration as PRCP increases is logical, as people who can make their trips shorter will. However, beyond a certain point, the people who cannot adjust their travel will then be bringing up the overall average trip duration. Unknown genders, who may be those who are not regular users of Citibike, are likely casual bikers who will decrease their trip lengths as much as posssible, and this is what the visualization depicts. It is curious that customers have inconsistent correlation to PRCP values. Perhaps we can infer that some rain deters users from taking long trips, while there is a certain amount of rain that is considered pleasant; this certain amount can also be an amount where casual riders do not ride, and so only bikers who bike out of need are biking in the middle range. After this middle range, perhaps even those bikers begin having to compromise on their trip lengths. Biking speed may also fluctuate and be responsible for trip duration changes.
averagePRCPMonthly <- tapply(masterdata$PRCP,masterdata$startMonthFactor,mean,)
plot(averagePRCPMonthly,xlab="Month",ylab="Average PRCP")
averageTripDurationMonthly <- tapply(masterdata$tripduration,masterdata$startMonthFactor,mean,)
plot(averageTripDurationMonthly,xlab="Month",ylab="Average Trip Duration")
numTripsMonthly <- table(masterdata$startMonth)
plot(x=averagePRCPMonthly, y=averageTripDurationMonthly)
plot(x=averagePRCPMonthly, y=numTripsMonthly)
ggplot(data=masterdata, aes(x=PRCP,y=tripduration)) + geom_point() + facet_wrap(~ startMonthFactor)
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=usertype)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=gender)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=gender)) + geom_smooth() + facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=usertype)) + geom_smooth() + facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It does not appear that months with higher average PRCP correspond to lower average trip durations. This may be due to the rainier months also being warmer and more pleasant than harsh winters of NY. Perhaps the pleasant days in rainy months are very positive for bikers in general, to the extent that they compensate for rainy days. We can see that, in different months, the amount of PRCP has varied correlations with trip duration. The winter months have little to know average tripduration changes as PRCP increases, which may reflect that bikers who ride during these times are not responsive to PRCP. Customers primarily decrease trip duration as PRCP increases, except in December and June, which may be months where tourists are determined to bike no matter the PRCP; subscribers vary greatly in their responses to PRCP in each month. Similar insights can be drawn when arranging the data by gender and usertype.
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec)) + geom_point()
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec, colour = gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec, colour = usertype)) + geom_smooth() + facet_wrap(~ ageGroup)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
PRCP has a general positive correlation with speed, which may indicate that bikers bike faster in rainier weather. It is important to note certain fluctuations in this correlation. Perhaps the dip in speed around PRCP=1 may indicate that this amount of rain is particularly difficult to bike in, which causes bikers to slow down.
ggplot(data=masterdata[masterdata$distMeters < 10000,], aes(x=SNOW, y=distMeters, colour=usertype)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Ride Distance", x = "Snow Depth (inches)", y = "Distance (meters)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 544 rows containing non-finite values (stat_smooth).
ggplot(data=masterdata[masterdata$distMeters < 10000,], aes(x=SNOW, y=distMeters, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Distance", x = "Snow Depth (inches)", y = "Distance (meters)")
## Warning: Removed 544 rows containing missing values (geom_point).
## Warning: Removed 544 rows containing missing values (geom_point).
ggplot(data=masterdata[masterdata$distMeters < 10000,], aes(x=roundedSNOW, y=distMeters, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Distance", x = "Snow Depth (rounded inches)", y = "Distance (meters)")
## Warning: Removed 544 rows containing missing values (geom_point).
## Warning: Removed 544 rows containing missing values (geom_point).
We can see the trend that customers generally ride a consistent distance regardless of snow, while subscribers tend to travel shorter distances when there is snow. This is likely due to the fact that subscribers or frequent users are more likely to be locals to NYC and use other forms of transportation instead (i.e. subway).
However, if we also factor in the amount of rides that are happening, we can see that subscribers make up a larger percentage of the total rides compared to customers whenever there is snow on the ground. Subscribers who are more likely to use a CitiBike than a regular customer in the snow (but will also ride a shorter distance).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=usertype)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Ride Duration", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 545 rows containing non-finite values (stat_smooth).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Duration", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=roundedSNOW, y=tripduration, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Duration", x = "Snow Depth (rounded inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
totalRides <- tapply(masterdata$tripduration, masterdata$SNOW, mean, na.rm = TRUE)
barplot(totalRides)
totalRides <- tapply(masterdata$tripduration, masterdata$roundedSNOW, mean, na.rm = TRUE)
barplot(totalRides)
Trip duration drastically decreases when there is snow on the ground this is likely due to a combination of temperature and safety concerns. However, we see a much steeper drop off in ridership and trip duration among regular customers than subscribers.
A potential solution to increase business would be to increase incentives for non-subscribers to ride when their is snow (is pricing a concern for them?) >> risk: liability?
# Count - Raw Data
ggplot(data=masterdata, aes(x=SNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
ggplot(data=masterdata[masterdata$SNOW > 0, ], aes(x=SNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
# Count - Rounded Values
ggplot(data=masterdata, aes(x=roundedSNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
ggplot(data=masterdata[masterdata$roundedSNOW > 0, ], aes(x=roundedSNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
# Average - Rounded Values
masterdata %>%
group_by(roundedSNOW) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = roundedSNOW, y = numRides)) + geom_point() + ylim(0,1000) + labs(x = "Snow Depth", y = "Average Number of CitiBike Rides")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).
As expected the more snow there is on the ground the less riders (on average per day with that amount of snow) there are. The second plots doesn’t include 0 values to remove days where there is no snow on the ground.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=gender)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Riders based on Gender", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 545 rows containing non-finite values (stat_smooth).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=gender)) + geom_point(alpha = 0.25) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Riders based on Gender", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=roundedSNOW, y=tripduration, colour=gender)) + geom_point(alpha = 0.25) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Riders based on Gender", x = "Snow Depth (rounded inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
It seems that males are more likely to continue riding CitiBikes when there is snow on the ground than Females or Unknowns (which is expected as Unknowns fall largely in the non-subscriber category), but the difference is minimal.
ggplot(data=masterdata, aes(x=SNOW, y=speedMetersperSec, colour=usertype)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Rider Speed", x = "Snow Depth (inches)", y = "Speed (meters per sec)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 545 rows containing non-finite values (stat_smooth).
Subscribers are trying to get from point A to point B and actually go faster with more snow (likely to get out of the cold and we can assume NYC streets are well plowed despite indicated snow depth)
masterdata %>%
mutate(Timings = as.POSIXct(startTime)) %>%
group_by(lubridate::hour(Timings)) %>%
summarise(count=n()) %>%
arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 24 x 2
## `lubridate::hour(Timings)` count
## <int> <int>
## 1 17 20194
## 2 18 18831
## 3 8 16185
## 4 16 14545
## 5 19 13033
## 6 9 13020
## 7 15 12473
## 8 14 12190
## 9 13 11698
## 10 12 11129
## # ... with 14 more rows
hour <- format(as.POSIXct(masterdata$startTime, format="%H:%M:%S"),"%H")
hourie <- as.factor(hour)
go <- tapply(masterdata$tripduration, hourie, mean)
barplot(go,
main="Trip Duration by Month(Average)",
names.arg=c( "00", "01", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"),
ylab="Average Duration",
col=c("red", "white", "blue", "green", "black", "yellow", "purple", "grey", "pink", "orange"),
)
hourtable <- table(hourie)
barplot(hourtable)
masterdata$hour <- hour
# Unique Departures and Arrivals for Each Citi Bike Station
StationStartsAM <- as.data.frame(table(masterAM$start.station.name))
StationEndsAM <- as.data.frame(table(masterAM$end.station.name))
StationDataAM <- data.frame(masterAM$start.station.name)
StationDataAM <- unique(StationDataAM)
StationDataAM$numStarts <- StationStartsAM$Freq[match(StationDataAM$masterAM.start.station.name, StationStartsAM$Var1)]
StationDataAM$numEnds <- StationEndsAM$Freq[match(StationDataAM$masterAM.start.station.name, StationEndsAM$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataAM$difference <- StationDataAM$numEnds - StationDataAM$numStarts
StationDataAM <- arrange(StationDataAM, desc(difference))
StationDataAM <- na.omit(StationDataAM)
# Top 10 stations that gain bikes throughout the morning
TopTenSurplusAM <- head(StationDataAM, 10)
# Top 10 stations that lose bikes throughout the morning
TopTenDeficitAM <-tail(StationDataAM, 10)
TopTenDeficitAM <- arrange(TopTenDeficitAM, difference)
#Replicate for PM times
# Unique Departures and Arrivals for Each Citi Bike Station
StationStartsPM <- as.data.frame(table(masterPM$start.station.name))
StationEndsPM <- as.data.frame(table(masterPM$end.station.name))
StationDataPM <- data.frame(masterPM$start.station.name)
StationDataPM <- unique(StationDataPM)
StationDataPM$numStarts <- StationStartsPM$Freq[match(StationDataPM$masterPM.start.station.name, StationStartsPM$Var1)]
StationDataPM$numEnds <- StationEndsPM$Freq[match(StationDataPM$masterPM.start.station.name, StationEndsPM$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataPM$difference <- StationDataPM$numEnds - StationDataPM$numStarts
StationDataPM <- arrange(StationDataPM, desc(difference))
StationDataPM <- na.omit(StationDataPM)
# Top 10 stations that gain bikes throughout the morning
TopTenSurplusPM <- head(StationDataPM, 10)
# Top 10 stations that lose bikes throughout the morning
TopTenDeficitPM <-tail(StationDataPM, 10)
TopTenDeficitPM <- arrange(TopTenDeficitPM, difference)
TopTenSurplusAM
## masterAM.start.station.name numStarts numEnds difference
## 1 Broadway & E 22 St 239 623 384
## 2 North Moore St & Greenwich St 108 485 377
## 3 E 47 St & Park Ave 242 582 340
## 4 W 52 St & 6 Ave 156 439 283
## 5 W 52 St & 5 Ave 77 355 278
## 6 6 Ave & Canal St 89 364 275
## 7 Grand Army Plaza & Central Park S 169 423 254
## 8 E 24 St & Park Ave S 252 472 220
## 9 E 48 St & 5 Ave 153 356 203
## 10 Broadway & Battery Pl 91 288 197
TopTenDeficitAM
## masterAM.start.station.name numStarts numEnds difference
## 1 8 Ave & W 31 St 709 260 -449
## 2 E 13 St & Avenue A 386 125 -261
## 3 E 10 St & Avenue A 353 108 -245
## 4 Christopher St & Greenwich St 449 270 -179
## 5 12 Ave & W 40 St 404 233 -171
## 6 E 6 St & Avenue B 269 98 -171
## 7 1 Ave & E 18 St 272 103 -169
## 8 E 7 St & Avenue A 311 142 -169
## 9 E 2 St & Avenue B 266 100 -166
## 10 E 20 St & FDR Drive 260 96 -164
TopTenSurplusPM
## masterPM.start.station.name numStarts numEnds difference
## 1 8 Ave & W 31 St 491 909 418
## 2 E 10 St & Avenue A 342 607 265
## 3 E 20 St & FDR Drive 262 450 188
## 4 E 2 St & Avenue B 281 465 184
## 5 E 13 St & Avenue A 464 639 175
## 6 E 6 St & Avenue B 284 454 170
## 7 1 Ave & E 16 St 494 655 161
## 8 St Marks Pl & 1 Ave 383 541 158
## 9 12 Ave & W 40 St 610 761 151
## 10 1 Ave & E 18 St 297 445 148
TopTenDeficitPM
## masterPM.start.station.name numStarts numEnds difference
## 1 North Moore St & Greenwich St 611 258 -353
## 2 Grand Army Plaza & Central Park S 636 311 -325
## 3 Broadway & E 22 St 886 571 -315
## 4 E 47 St & Park Ave 515 223 -292
## 5 W 52 St & 5 Ave 441 191 -250
## 6 W 52 St & 6 Ave 432 195 -237
## 7 E 48 St & 5 Ave 513 280 -233
## 8 E 24 St & Park Ave S 607 391 -216
## 9 6 Ave & Canal St 394 191 -203
## 10 Broadway & Battery Pl 372 210 -162
# Top Ten gains in the morning
TenSurplusMorning <- ggplot(TopTenSurplusAM, aes(reorder(masterAM.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusMorning
this chart shows a high level of asymmetry among the bike stations with station “Broadwat & E 22 St” having 384 more arrivals than departures in the morning throughout the year.
# Top Ten loses in the morning
TenDeficitMorning <- ggplot(TopTenDeficitAM, aes(reorder(masterAM.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitMorning
Asymmetric traffic patterns are also evident in the chart above, with station “8 Ave & W 31 St” experiencing 449 more departures than arrivals in the morning throughout the year.
# Top Ten gains in the afternoon
TenSurplusAfternoon<- ggplot(TopTenSurplusPM, aes(reorder(masterPM.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusAfternoon
The evidence of asymmetric traffic is also present in this chart, with station " 8 Ave & W 31 St " leading with 418 more arrivals than departures. Some symmetry between the two time periods is seen here with a loss of 449 bikes at this same station in the morning time period throughout the year. In fact, 8 of the top 10 stations that lose bikes in the morning period are present here gaining bikes.
# Top Ten loses in the afternoon
TenDeficitAfternoon <- ggplot(TopTenDeficitPM, aes(reorder(masterPM.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitAfternoon
In the chart for the top ten stations that experience losses all ten of the stations that experience overflow of bikes in the morning time perion are present here.
#create counts of data
# Unique Departures and Arrivals for Each Citi Bike Station for Week
StationStartsWeek <- as.data.frame(table(weekdayData$start.station.name))
StationEndsWeek <- as.data.frame(table(weekdayData$end.station.name))
StationDataWeek <- data.frame(weekdayData$start.station.name)
StationDataWeek <- unique(StationDataWeek)
StationDataWeek$numStarts <- StationStartsWeek$Freq[match(StationDataWeek$weekdayData.start.station.name, StationStartsWeek$Var1)]
StationDataWeek$numEnds <- StationEndsWeek$Freq[match(StationDataWeek$weekdayData.start.station.name, StationEndsWeek$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataWeek$difference <- StationDataWeek$numEnds - StationDataWeek$numStarts
StationDataWeek <- arrange(StationDataWeek, desc(difference))
StationDataWeek <- na.omit(StationDataWeek)
# Top 10 stations that gain bikes throughout Weekdays
TopTenSurplusWeek <- head(StationDataWeek, 10)
# Top 10 stations that lose bikes throughout Weekend Mornings
TopTenDeficitWeek <-tail(StationDataWeek, 10)
TopTenDeficitWeek <- arrange(TopTenDeficitWeek, difference)
#create counts of data
# Unique Departures and Arrivals for Each Citi Bike Station for Weekend
StationStartsWeekend <- as.data.frame(table(weekendData$start.station.name))
StationEndsWeekend <- as.data.frame(table(weekendData$end.station.name))
StationDataWeekend <- data.frame(weekendData$start.station.name)
StationDataWeekend <- unique(StationDataWeekend)
StationDataWeekend$numStarts <- StationStartsWeekend$Freq[match(StationDataWeekend$weekendData.start.station.name, StationStartsWeekend$Var1)]
StationDataWeekend$numEnds <- StationEndsWeekend$Freq[match(StationDataWeekend$weekendData.start.station.name, StationEndsWeekend$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataWeekend$difference <- StationDataWeekend$numEnds - StationDataWeekend$numStarts
StationDataWeekend <- arrange(StationDataWeekend, desc(difference))
StationDataWeekend <- na.omit(StationDataWeekend)
# Top 10 stations that gain bikes throughout Weekend Mornings
TopTenSurplusWeekend <- head(StationDataWeekend, 10)
# Top 10 stations that lose bikes throughout Weekend Mornings
TopTenDeficitWeekend <-tail(StationDataWeekend, 10)
TopTenDeficitWeekend <- arrange(TopTenDeficitWeekend, difference)
# Top Surplus During Weekday Mornings
TenSurplusWeek <- ggplot(TopTenSurplusWeek, aes(reorder(weekdayData.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusWeek
# Top Deficit during Week Mornings
TenDeficitWeek <- ggplot(TopTenDeficitWeek, aes(reorder(weekdayData.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitWeek
# Top Surplus During Weekend Mornings
TenSurplusWeekend <- ggplot(TopTenSurplusWeekend, aes(reorder(weekendData.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusWeekend
# Top Deficit during Weekend Mornings
TenDeficitWeekend <- ggplot(TopTenDeficitWeekend, aes(reorder(weekendData.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitWeekend
Between the top deficit stations in the morning of the weekends and weekdays there is only one overlapping station: E 13th and Avenue A., further the highest deficit on weekends is around 40 (average 20 a day [40/2]) whereas during the week the highest is approximately 450(average 90 a day [450/5]). Therefore this analysis shows that there is much more asymmetry during the week as opposed to the weekend, when less people commute for work. This pattern holds true for top surplus stations in the mornings of the weekend and weekdays with no overlapping stations,and the top station during weekdays with a surplus of 375 (average 75 a day [375/5])compared to a top surplus of 55 on the weekend (average 28 a day [55/2]). In summary, this supports the hypothesis that asymmetry is mainly caused by commutes to work during the week.
CountCustomerBroadway <- nrow(masterAM[masterAM$usertype == "Customer" & masterAM$start.station.name == "Broadway & E 22 St",])
CountSubscriberBroadway <- nrow(masterAM[masterAM$usertype == "Subscriber" & masterAM$start.station.name == "Broadway & E 22 St",])
CountCustomerNMoore <- nrow(masterPM[masterPM$usertype == "Customer" & masterPM$start.station.name == "North Moore St & Greenwich St",])
CountSubscriberNMoore <- nrow(masterPM[masterPM$usertype == "Subscriber" & masterPM$start.station.name == "North Moore St & Greenwich St",])
For stations that are asymmetric, users tend to be “subscribers” as opposed to “customers”. At the station “Broadway & E 22 St” of the 239 users who started trips there 230 are subscribers while only 9. For another station “North Moore St & Greenwich” there was only 61customers while there was 550 Subscribers.
# Add longitude and latitude to the dataset
startlatitude <- c(40.7403432,40.72019521,40.75510267, 40.76132983,40.75992262,40.72243797,40.7643971,40.74096374, 40.75724568, 40.70463334)
TopTenSurplusAM$startlatitude <- startlatitude
startlongitude <- c(-73.98955109,-74.01030064,-73.97498696
,-73.97982001, -73.97648516, -74.00566443
, -73.97371465, -73.98602213, -73.97805914
, -74.01361706)
TopTenSurplusAM$startlongitude <- startlongitude
#Map TopTenSurplus AM
register_google(key = "AIzaSyDr6TG5wIRo6iXXvRbE0rV3n2EPx1jApRc")
## get station info
station.info <- TopTenSurplusAM %>%
group_by(masterAM.start.station.name) %>%
summarise(lat=as.numeric(startlatitude),
long=as.numeric(startlongitude),
difference = difference)
## `summarise()` ungrouping output (override with `.groups` argument)
## get map and plot station locations
newyork.map <- get_map(location= 'Lower Manhattan, New York',
maptype='roadmap', color='bw',source='google',zoom=12)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Lower%20Manhattan,%20New%20York&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Lower+Manhattan,+New+York&key=xxx
ggmap(newyork.map) +
geom_point(data=station.info,aes(x=long,y=lat,color= difference),size=5,alpha=0.75)+
scale_colour_gradient(high="red",low='green')+
theme(axis.ticks = element_blank(),axis.text = element_blank())+
xlab('')+ylab('')
As seen in the geographic groupings of the top asymmetric stations most of them are located in Lower Manhattan, specifically in areas like Midtown where there are many jobs which is true for pretty much all of these areas. Although this map only shows the top stations that gain bikes in the morning, this pattern is true for the asymmetric stations due to the overlap from the relationship between the stations.